# Global knitr chunk options: echo the code, hide messages and warnings.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
# attach packages
library(tidyverse)
library(here)
library(tidytext)
library(textdata)
library(pdftools)
library(ggwordcloud)
library(kableExtra)
Get the lyrics for Act 1 and Act 2
# Read the lyrics PDF for each act from the project's data/ folder.
ham_act1 <- here("data", "hamilton_act1.pdf") %>%
  pdf_text()
ham_act2 <- here("data", "hamilton_act2.pdf") %>%
  pdf_text()
Convert text to a dataframe
# Act 1: start with one row per element of `ham_act1`, number those rows as
# pages, split each page on newlines, and expand to one trimmed line per row.
act1_lines <- data.frame(ham_act1) %>%
  mutate(
    # row_number() stays correct for zero rows, where 1:n() gives c(1, 0).
    page = row_number(),
    text_act1 = str_split(ham_act1, pattern = "\\n")
  ) %>%
  unnest(text_act1) %>%
  mutate(text_act1 = str_trim(text_act1))

# Act 2: same reshaping as Act 1.
act2_lines <- data.frame(ham_act2) %>%
  mutate(
    # row_number() stays correct for zero rows, where 1:n() gives c(1, 0).
    page = row_number(),
    text_act2 = str_split(ham_act2, pattern = "\\n")
  ) %>%
  unnest(text_act2) %>%
  mutate(text_act2 = str_trim(text_act2))
Do some tidying
# Act 1: copy every line containing "Song" into `song`, carry that value down
# over the lines that follow, split it on spaces into `so` and `no`, and
# convert the Roman numeral in `no` to a number.
tunes_act1 <- act1_lines %>%
  mutate(
    song = ifelse(str_detect(text_act1, "Song"), text_act1, NA)
  ) %>%
  fill(song) %>%
  separate(song, into = c("so", "no"), sep = " ") %>%
  mutate(song = as.numeric(as.roman(no)))

# Act 2: same song labelling as Act 1.
tunes_act2 <- act2_lines %>%
  mutate(
    song = ifelse(str_detect(text_act2, "Song"), text_act2, NA)
  ) %>%
  fill(song) %>%
  separate(song, into = c("so", "no"), sep = " ") %>%
  mutate(song = as.numeric(as.roman(no)))
Find word count per act by song
# Tokenise each lyric line into words (one row per word) and drop the raw
# page-text column.
words_act1 <- tunes_act1 %>%
  unnest_tokens(output = word, input = text_act1) %>%
  select(-ham_act1)
words_act2 <- tunes_act2 %>%
  unnest_tokens(output = word, input = text_act2) %>%
  select(-ham_act2)

# Number of times each word occurs within each song.
act1_wordcount <- count(words_act1, song, word)
act2_wordcount <- count(words_act2, song, word)
Remove Stop Words
# head(stop_words)
# Remove every word that appears in `stop_words` from each act.
words_act1_clean <- anti_join(words_act1, stop_words, by = "word")
words_act2_clean <- anti_join(words_act2, stop_words, by = "word")

# Per-song counts of the words that remain.
act1_nonstop_counts <- count(words_act1_clean, song, word)
act2_nonstop_counts <- count(words_act2_clean, song, word)
Find top 5 words from each song in each Act
# Sort by descending count, then keep the first five rows of every song.
act1_top5_words <- act1_nonstop_counts %>%
  arrange(desc(n)) %>%
  group_by(song) %>%
  slice(1:5) %>%
  ungroup()

act2_top5_words <- act2_nonstop_counts %>%
  arrange(desc(n)) %>%
  group_by(song) %>%
  slice(1:5) %>%
  ungroup()
Act 1
Songs 1-5
# Titles for Act 1 songs 1-5, keyed by song number, in song-number order.
act1_titles_1_5 <- c(
  "1" = "Alexander Hamilton",
  "2" = "Aaron Burr, Sir",
  "3" = "My Shot",
  "4" = "The Story of Tonight",
  "5" = "The Schuyler Sisters"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act1_1_5 <- act1_top5_words %>%
  filter(song %in% as.numeric(names(act1_titles_1_5))) %>%
  mutate(
    song = factor(
      unname(act1_titles_1_5[as.character(song)]),
      levels = unname(act1_titles_1_5)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act1_1_5, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Songs 6-10
# Titles for Act 1 songs 6-10, keyed by song number, in song-number order.
act1_titles_6_10 <- c(
  "6" = "Farmer Refuted",
  "7" = "You'll Be Back",
  "8" = "Right Hand Man",
  "9" = "A Winter's Ball",
  "10" = "Helpless"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act1_6_10 <- act1_top5_words %>%
  filter(song %in% as.numeric(names(act1_titles_6_10))) %>%
  mutate(
    song = factor(
      unname(act1_titles_6_10[as.character(song)]),
      levels = unname(act1_titles_6_10)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act1_6_10, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Songs 11-15
# Titles for Act 1 songs 11-15, keyed by song number, in song-number order.
act1_titles_11_15 <- c(
  "11" = "Satisfied",
  "12" = "The Story of \nTonight (Reprise)",
  "13" = "Wait for It",
  "14" = "Stay Alive",
  "15" = "Ten Duel \nCommandments"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act1_11_15 <- act1_top5_words %>%
  filter(song %in% as.numeric(names(act1_titles_11_15))) %>%
  mutate(
    song = factor(
      unname(act1_titles_11_15[as.character(song)]),
      levels = unname(act1_titles_11_15)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act1_11_15, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Songs 16-20
# Titles for Act 1 songs 16-20, keyed by song number, in song-number order.
act1_titles_16_20 <- c(
  "16" = "Meet Me Inside",
  "17" = "That Would Be Enough",
  "18" = "Guns and Ships",
  "19" = "History Has Its \nEyes on You",
  "20" = "Yorktown (The World \nTurned Upside Down)"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act1_16_20 <- act1_top5_words %>%
  filter(song %in% as.numeric(names(act1_titles_16_20))) %>%
  mutate(
    song = factor(
      unname(act1_titles_16_20[as.character(song)]),
      levels = unname(act1_titles_16_20)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act1_16_20, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Songs 21-23
# Titles for Act 1 songs 21-23, keyed by song number, in song-number order.
act1_titles_21_23 <- c(
  "21" = "What Comes Next?",
  "22" = "Dear Theodosia",
  "23" = "Non-Stop"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act1_21_23 <- act1_top5_words %>%
  filter(song %in% as.numeric(names(act1_titles_21_23))) %>%
  mutate(
    song = factor(
      unname(act1_titles_21_23[as.character(song)]),
      levels = unname(act1_titles_21_23)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act1_21_23, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Act 2
Songs 1-5
# Titles for Act 2 songs 1-5, keyed by song number, in song-number order.
act2_titles_1_5 <- c(
  "1" = "What'd I Miss",
  "2" = "Cabinet Battle #1",
  "3" = "Take a Break",
  "4" = "Say No to This",
  "5" = "The Room Where \nIt Happens"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act2_1_5 <- act2_top5_words %>%
  filter(song %in% as.numeric(names(act2_titles_1_5))) %>%
  mutate(
    song = factor(
      unname(act2_titles_1_5[as.character(song)]),
      levels = unname(act2_titles_1_5)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act2_1_5, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Songs 6-10
# Titles for Act 2 songs 6-10, keyed by song number, in song-number order.
act2_titles_6_10 <- c(
  "6" = "Schuyler Defeated",
  "7" = "Cabinet Battle #2",
  "8" = "Washington on Your Side",
  "9" = "One Last Time",
  "10" = "I Know Him"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act2_6_10 <- act2_top5_words %>%
  filter(song %in% as.numeric(names(act2_titles_6_10))) %>%
  mutate(
    song = factor(
      unname(act2_titles_6_10[as.character(song)]),
      levels = unname(act2_titles_6_10)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act2_6_10, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Songs 11-15
# Titles for Act 2 songs 11-15, keyed by song number, in song-number order.
act2_titles_11_15 <- c(
  "11" = "The Adams Administration",
  "12" = "We Know",
  "13" = "Hurricane",
  "14" = "The Reynolds Pamphlet",
  "15" = "Burn"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act2_11_15 <- act2_top5_words %>%
  filter(song %in% as.numeric(names(act2_titles_11_15))) %>%
  mutate(
    song = factor(
      unname(act2_titles_11_15[as.character(song)]),
      levels = unname(act2_titles_11_15)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act2_11_15, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Songs 16-20
# Titles for Act 2 songs 16-20, keyed by song number, in song-number order.
act2_titles_16_20 <- c(
  "16" = "Blow Us All Away",
  "17" = "Stay Alive - Reprise",
  "18" = "It's Quiet Uptown",
  "19" = "The Election of 1800",
  "20" = "Your Obedient Servant"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act2_16_20 <- act2_top5_words %>%
  filter(song %in% as.numeric(names(act2_titles_16_20))) %>%
  mutate(
    song = factor(
      unname(act2_titles_16_20[as.character(song)]),
      levels = unname(act2_titles_16_20)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act2_16_20, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Songs 21-23
# Titles for Act 2 songs 21-23, keyed by song number, in song-number order.
act2_titles_21_23 <- c(
  "21" = "Best of Wives and \nBest of Women",
  "22" = "The World Was Wide Enough",
  "23" = "Who Lives, Who Dies, \n Who Tells Your Story"
)

# Keep this group's songs and replace each number with its title. `song` is
# numeric, so it is filtered numerically; the factor levels follow song order
# so the facets are not sorted alphabetically.
act2_21_23 <- act2_top5_words %>%
  filter(song %in% as.numeric(names(act2_titles_21_23))) %>%
  mutate(
    song = factor(
      unname(act2_titles_21_23[as.character(song)]),
      levels = unname(act2_titles_21_23)
    )
  )

# Top-word counts, one panel per song with independent axes.
ggplot(data = act2_21_23, aes(x = n, y = word)) +
  geom_col() +
  facet_wrap(~song, scales = "free")

Word clouds of top 100 words in each act
# Top 100 non-stop words across each whole act. Words are counted over the act
# rather than taken from the per-song counts, so a word that occurs in several
# songs gets one row with its act-wide total instead of several rows with
# split totals.
act1_top100 <- words_act1_clean %>%
  count(word) %>%
  arrange(-n) %>%
  slice(1:100)

act2_top100 <- words_act2_clean %>%
  count(word) %>%
  arrange(-n) %>%
  slice(1:100)
Act 1
# Word cloud for Act 1: both text size and colour are mapped to the count `n`.
act1_cloud <- ggplot(
  data = act1_top100,
  mapping = aes(label = word, color = n, size = n)
) +
  geom_text_wordcloud(shape = "pentagon", eccentricity = 0.4) +
  scale_size_area(max_size = 9) +
  scale_color_gradientn(colors = c("darkgreen", "blue", "purple")) +
  theme_minimal()

act1_cloud

Act 2
# Word cloud for Act 2: both text size and colour are mapped to the count `n`.
act2_cloud <- ggplot(
  data = act2_top100,
  mapping = aes(label = word, color = n, size = n)
) +
  geom_text_wordcloud(shape = "pentagon") +
  scale_color_gradientn(colors = c("darkgreen", "blue", "purple")) +
  scale_size_area(max_size = 6) +
  theme_minimal()

act2_cloud

Sentiment Analysis
“afinn” Lexicon
Act 1
# Attach an AFINN value to every Act 1 word found in the lexicon; words with
# no match are dropped by the inner join.
act1_afinn <- inner_join(
  words_act1_clean,
  get_sentiments("afinn"),
  by = "word"
)

# Number of scored words per song at each AFINN value.
act1_afinn_counts <- count(act1_afinn, song, value)
Songs 1-5
# Titles for Act 1 songs 1-5, keyed by song number, in song-number order.
act1_afinn_titles_1_5 <- c(
  "1" = "Alexander Hamilton",
  "2" = "Aaron Burr, Sir",
  "3" = "My Shot",
  "4" = "The Story of Tonight",
  "5" = "The Schuyler Sisters"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_afinn_1_5 <- act1_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act1_afinn_titles_1_5))) %>%
  mutate(
    song = factor(
      unname(act1_afinn_titles_1_5[as.character(song)]),
      levels = unname(act1_afinn_titles_1_5)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act1_afinn_1_5, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Songs 6-10
# Titles for Act 1 songs 6-10, keyed by song number, in song-number order.
act1_afinn_titles_6_10 <- c(
  "6" = "Farmer Refuted",
  "7" = "You'll Be Back",
  "8" = "Right Hand Man",
  "9" = "A Winter's Ball",
  "10" = "Helpless"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_afinn_6_10 <- act1_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act1_afinn_titles_6_10))) %>%
  mutate(
    song = factor(
      unname(act1_afinn_titles_6_10[as.character(song)]),
      levels = unname(act1_afinn_titles_6_10)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act1_afinn_6_10, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Songs 11-15
# Titles for Act 1 songs 11-15, keyed by song number, in song-number order.
act1_afinn_titles_11_15 <- c(
  "11" = "Satisfied",
  "12" = "The Story of \nTonight (Reprise)",
  "13" = "Wait for It",
  "14" = "Stay Alive",
  "15" = "Ten Duel \nCommandments"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_afinn_11_15 <- act1_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act1_afinn_titles_11_15))) %>%
  mutate(
    song = factor(
      unname(act1_afinn_titles_11_15[as.character(song)]),
      levels = unname(act1_afinn_titles_11_15)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act1_afinn_11_15, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Songs 16-20
# Titles for Act 1 songs 16-20, keyed by song number, in song-number order.
act1_afinn_titles_16_20 <- c(
  "16" = "Meet Me Inside",
  "17" = "That Would Be Enough",
  "18" = "Guns and Ships",
  "19" = "History Has Its \nEyes on You",
  "20" = "Yorktown (The World \nTurned Upside Down)"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_afinn_16_20 <- act1_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act1_afinn_titles_16_20))) %>%
  mutate(
    song = factor(
      unname(act1_afinn_titles_16_20[as.character(song)]),
      levels = unname(act1_afinn_titles_16_20)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act1_afinn_16_20, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Songs 21-23
# Titles for Act 1 songs 21-23, keyed by song number, in song-number order.
act1_afinn_titles_21_23 <- c(
  "21" = "What Comes Next?",
  "22" = "Dear Theodosia",
  "23" = "Non-Stop"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_afinn_21_23 <- act1_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act1_afinn_titles_21_23))) %>%
  mutate(
    song = factor(
      unname(act1_afinn_titles_21_23[as.character(song)]),
      levels = unname(act1_afinn_titles_21_23)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act1_afinn_21_23, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Mean
# Average AFINN value of the scored words in each Act 1 song.
act1_afinn_means <- act1_afinn %>%
  group_by(song) %>%
  summarise(mean_afinn = mean(value))

# Horizontal bars of the per-song mean; the song factor's levels are reversed
# before the axes are flipped.
ggplot(
  act1_afinn_means,
  aes(x = fct_rev(factor(song)), y = mean_afinn)
) +
  geom_col() +
  coord_flip()

Act 2
# Attach an AFINN value to every Act 2 word found in the lexicon; words with
# no match are dropped by the inner join.
act2_afinn <- inner_join(
  words_act2_clean,
  get_sentiments("afinn"),
  by = "word"
)

# Number of scored words per song at each AFINN value.
act2_afinn_counts <- count(act2_afinn, song, value)
Songs 1-5
# Titles for Act 2 songs 1-5, keyed by song number, in song-number order.
act2_afinn_titles_1_5 <- c(
  "1" = "What'd I Miss",
  "2" = "Cabinet Battle #1",
  "3" = "Take a Break",
  "4" = "Say No to This",
  "5" = "The Room Where \nIt Happens"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_afinn_1_5 <- act2_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act2_afinn_titles_1_5))) %>%
  mutate(
    song = factor(
      unname(act2_afinn_titles_1_5[as.character(song)]),
      levels = unname(act2_afinn_titles_1_5)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act2_afinn_1_5, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Songs 6-10
# Titles for Act 2 songs 6-10, keyed by song number, in song-number order.
act2_afinn_titles_6_10 <- c(
  "6" = "Schuyler Defeated",
  "7" = "Cabinet Battle #2",
  "8" = "Washington on Your Side",
  "9" = "One Last Time",
  "10" = "I Know Him"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_afinn_6_10 <- act2_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act2_afinn_titles_6_10))) %>%
  mutate(
    song = factor(
      unname(act2_afinn_titles_6_10[as.character(song)]),
      levels = unname(act2_afinn_titles_6_10)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act2_afinn_6_10, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Songs 11-15
# Titles for Act 2 songs 11-15, keyed by song number, in song-number order.
act2_afinn_titles_11_15 <- c(
  "11" = "The Adams Administration",
  "12" = "We Know",
  "13" = "Hurricane",
  "14" = "The Reynolds Pamphlet",
  "15" = "Burn"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_afinn_11_15 <- act2_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act2_afinn_titles_11_15))) %>%
  mutate(
    song = factor(
      unname(act2_afinn_titles_11_15[as.character(song)]),
      levels = unname(act2_afinn_titles_11_15)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act2_afinn_11_15, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Songs 16-20
# Titles for Act 2 songs 16-20, keyed by song number, in song-number order.
act2_afinn_titles_16_20 <- c(
  "16" = "Blow Us All Away",
  "17" = "Stay Alive - Reprise",
  "18" = "It's Quiet Uptown",
  "19" = "The Election of 1800",
  "20" = "Your Obedient Servant"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_afinn_16_20 <- act2_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act2_afinn_titles_16_20))) %>%
  mutate(
    song = factor(
      unname(act2_afinn_titles_16_20[as.character(song)]),
      levels = unname(act2_afinn_titles_16_20)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act2_afinn_16_20, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Songs 21-23
# Titles for Act 2 songs 21-23, keyed by song number, in song-number order.
act2_afinn_titles_21_23 <- c(
  "21" = "Best of Wives and \nBest of Women",
  "22" = "The World Was Wide Enough",
  "23" = "Who Lives, Who Dies, \n Who Tells Your Story"
)

# Tally words per song and AFINN value, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_afinn_21_23 <- act2_afinn %>%
  count(song, value) %>%
  filter(song %in% as.numeric(names(act2_afinn_titles_21_23))) %>%
  mutate(
    song = factor(
      unname(act2_afinn_titles_21_23[as.character(song)]),
      levels = unname(act2_afinn_titles_21_23)
    )
  )

# Word counts at each AFINN value, one panel per song.
ggplot(data = act2_afinn_21_23, aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~song)

Mean
# Average AFINN value of the scored words in each Act 2 song.
act2_afinn_means <- act2_afinn %>%
  group_by(song) %>%
  summarise(mean_afinn = mean(value))

# Horizontal bars of the per-song mean; the song factor's levels are reversed
# before the axes are flipped.
ggplot(
  act2_afinn_means,
  aes(x = fct_rev(factor(song)), y = mean_afinn)
) +
  geom_col() +
  coord_flip()

“NRC” lexicon
Act 1
# Attach NRC sentiment labels to the Act 1 words. The join key is named
# explicitly so the result does not depend on whichever column names the two
# tables happen to share, matching the AFINN joins.
act1_nrc <- words_act1_clean %>%
  inner_join(get_sentiments("nrc"), by = "word")

# Number of words per song in each sentiment category.
act1_nrc_counts <- act1_nrc %>%
  count(song, sentiment)

# Sentiment counts for every song, one panel per song number.
ggplot(data = act1_nrc_counts, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 1-5
# Titles for Act 1 songs 1-5, keyed by song number, in song-number order.
act1_nrc_titles_1_5 <- c(
  "1" = "Alexander Hamilton",
  "2" = "Aaron Burr, Sir",
  "3" = "My Shot",
  "4" = "The Story of Tonight",
  "5" = "The Schuyler Sisters"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_nrc_1_5 <- act1_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act1_nrc_titles_1_5))) %>%
  mutate(
    song = factor(
      unname(act1_nrc_titles_1_5[as.character(song)]),
      levels = unname(act1_nrc_titles_1_5)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act1_nrc_1_5, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 6-10
# Titles for Act 1 songs 6-10, keyed by song number, in song-number order.
act1_nrc_titles_6_10 <- c(
  "6" = "Farmer Refuted",
  "7" = "You'll Be Back",
  "8" = "Right Hand Man",
  "9" = "A Winter's Ball",
  "10" = "Helpless"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_nrc_6_10 <- act1_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act1_nrc_titles_6_10))) %>%
  mutate(
    song = factor(
      unname(act1_nrc_titles_6_10[as.character(song)]),
      levels = unname(act1_nrc_titles_6_10)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act1_nrc_6_10, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 11-15
# Titles for Act 1 songs 11-15, keyed by song number, in song-number order.
act1_nrc_titles_11_15 <- c(
  "11" = "Satisfied",
  "12" = "The Story of \nTonight (Reprise)",
  "13" = "Wait for It",
  "14" = "Stay Alive",
  "15" = "Ten Duel \nCommandments"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_nrc_11_15 <- act1_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act1_nrc_titles_11_15))) %>%
  mutate(
    song = factor(
      unname(act1_nrc_titles_11_15[as.character(song)]),
      levels = unname(act1_nrc_titles_11_15)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act1_nrc_11_15, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 16-20
# Titles for Act 1 songs 16-20, keyed by song number, in song-number order.
act1_nrc_titles_16_20 <- c(
  "16" = "Meet Me Inside",
  "17" = "That Would Be Enough",
  "18" = "Guns and Ships",
  "19" = "History Has Its \nEyes on You",
  "20" = "Yorktown (The World \nTurned Upside Down)"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_nrc_16_20 <- act1_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act1_nrc_titles_16_20))) %>%
  mutate(
    song = factor(
      unname(act1_nrc_titles_16_20[as.character(song)]),
      levels = unname(act1_nrc_titles_16_20)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act1_nrc_16_20, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 21-23
# Titles for Act 1 songs 21-23, keyed by song number, in song-number order.
act1_nrc_titles_21_23 <- c(
  "21" = "What Comes Next?",
  "22" = "Dear Theodosia",
  "23" = "Non-Stop"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act1_nrc_21_23 <- act1_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act1_nrc_titles_21_23))) %>%
  mutate(
    song = factor(
      unname(act1_nrc_titles_21_23[as.character(song)]),
      levels = unname(act1_nrc_titles_21_23)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act1_nrc_21_23, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Act 2
# Attach NRC sentiment labels to the Act 2 words. The join key is named
# explicitly so the result does not depend on whichever column names the two
# tables happen to share, matching the AFINN joins.
act2_nrc <- words_act2_clean %>%
  inner_join(get_sentiments("nrc"), by = "word")

# Number of words per song in each sentiment category.
act2_nrc_counts <- act2_nrc %>%
  count(song, sentiment)

# Sentiment counts for every song, one panel per song number.
ggplot(data = act2_nrc_counts, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 1-5
# Titles for Act 2 songs 1-5, keyed by song number, in song-number order.
act2_nrc_titles_1_5 <- c(
  "1" = "What'd I Miss",
  "2" = "Cabinet Battle #1",
  "3" = "Take a Break",
  "4" = "Say No to This",
  "5" = "The Room Where \nIt Happens"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_nrc_1_5 <- act2_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act2_nrc_titles_1_5))) %>%
  mutate(
    song = factor(
      unname(act2_nrc_titles_1_5[as.character(song)]),
      levels = unname(act2_nrc_titles_1_5)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act2_nrc_1_5, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 6-10
# Titles for Act 2 songs 6-10, keyed by song number, in song-number order.
act2_nrc_titles_6_10 <- c(
  "6" = "Schuyler Defeated",
  "7" = "Cabinet Battle #2",
  "8" = "Washington on Your Side",
  "9" = "One Last Time",
  "10" = "I Know Him"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_nrc_6_10 <- act2_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act2_nrc_titles_6_10))) %>%
  mutate(
    song = factor(
      unname(act2_nrc_titles_6_10[as.character(song)]),
      levels = unname(act2_nrc_titles_6_10)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act2_nrc_6_10, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 11-15
# Titles for Act 2 songs 11-15, keyed by song number, in song-number order.
act2_nrc_titles_11_15 <- c(
  "11" = "The Adams Administration",
  "12" = "We Know",
  "13" = "Hurricane",
  "14" = "The Reynolds Pamphlet",
  "15" = "Burn"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_nrc_11_15 <- act2_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act2_nrc_titles_11_15))) %>%
  mutate(
    song = factor(
      unname(act2_nrc_titles_11_15[as.character(song)]),
      levels = unname(act2_nrc_titles_11_15)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act2_nrc_11_15, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 16-20
# Titles for Act 2 songs 16-20, keyed by song number, in song-number order.
act2_nrc_titles_16_20 <- c(
  "16" = "Blow Us All Away",
  "17" = "Stay Alive - Reprise",
  "18" = "It's Quiet Uptown",
  "19" = "The Election of 1800",
  "20" = "Your Obedient Servant"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_nrc_16_20 <- act2_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act2_nrc_titles_16_20))) %>%
  mutate(
    song = factor(
      unname(act2_nrc_titles_16_20[as.character(song)]),
      levels = unname(act2_nrc_titles_16_20)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act2_nrc_16_20, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()

Songs 21-23
# Titles for Act 2 songs 21-23, keyed by song number, in song-number order.
act2_nrc_titles_21_23 <- c(
  "21" = "Best of Wives and \nBest of Women",
  "22" = "The World Was Wide Enough",
  "23" = "Who Lives, Who Dies, \n Who Tells Your Story"
)

# Tally words per song and sentiment, keep this group's songs, and replace
# each number with its title. `song` is numeric, so it is filtered numerically;
# the factor levels follow song order so facets are not sorted alphabetically.
act2_nrc_21_23 <- act2_nrc %>%
  count(song, sentiment) %>%
  filter(song %in% as.numeric(names(act2_nrc_titles_21_23))) %>%
  mutate(
    song = factor(
      unname(act2_nrc_titles_21_23[as.character(song)]),
      levels = unname(act2_nrc_titles_21_23)
    )
  )

# Horizontal bars of word counts per sentiment, one panel per song.
ggplot(data = act2_nrc_21_23, aes(x = sentiment, y = n)) +
  geom_col() +
  facet_wrap(~song) +
  coord_flip()
